簡易數據分析

林嶔 (Lin, Chin)

Lesson 8

第一節:描述性統計(1)

– 請至這裡下載本週的範例資料

dat = read.csv("Example data.csv", header = TRUE, fileEncoding = 'CP950')
head(dat)
##       eGFR Disease Survival.time Death Diabetes Cancer      SBP      DBP
## 1 34.65379       1     0.4771037     0        0      1 121.2353 121.3079
## 2 37.21183       1     3.0704424     0        1      1 122.2000 122.6283
## 3 32.60074       1     0.2607117     1        0      0 118.9136 121.7621
## 4 29.68481       1            NA    NA        0      0 118.2212 112.7043
## 5 28.35726       0     0.1681673     1        0      0 116.7469 115.7705
## 6 33.95012       1     1.2238556     0        0      0 119.9936 116.3872
##   Education Income
## 1         2      0
## 2         2      0
## 3         0      0
## 4         1      0
## 5         0      0
## 6         1      0
  1. 函數「mean()」可以幫助我們計算平均值
mean(dat$eGFR, na.rm = TRUE)
## [1] 34.14115
  1. 函數「sd()」可以幫助我們計算標準差
sd(dat$eGFR, na.rm = TRUE)
## [1] 7.062506
  1. 函數「var()」可以幫助我們計算變異數
var(dat$eGFR, na.rm = TRUE)
## [1] 49.87899
  1. 函數「median()」可以幫助我們計算變異數
median(dat$eGFR, na.rm = TRUE)
## [1] 34.33123
  1. 函數「quantile()」可以幫助我們計算百分位數
quantile(dat$eGFR, na.rm = TRUE)
##       0%      25%      50%      75%     100% 
## 10.00000 29.58404 34.33123 38.44965 60.00000
quantile(dat$eGFR, 0.5, na.rm = TRUE)
##      50% 
## 34.33123
quantile(dat$eGFR, 0.95, na.rm = TRUE)
##      95% 
## 45.86468
  1. 函數「min()」可以幫助我們找出最小值
min(dat$eGFR, na.rm = TRUE)
## [1] 10
  1. 函數「max()」可以幫助我們找出最大值
max(dat$eGFR, na.rm = TRUE)
## [1] 60
  1. 函數「table()」可以幫助我們產生列聯表
table(dat$Cancer)
## 
##   0   1 
## 576 402
table(dat$Cancer, dat$Diabetes)
##    
##       0   1
##   0 468  97
##   1 309  81
  1. 函數「prop.table()」可以幫助我們產生列聯表的百分比
tab1 = table(dat$Cancer)
prop.table(tab1)
## 
##         0         1 
## 0.5889571 0.4110429
tab2 = table(dat$Cancer, dat$Diabetes)
prop.table(tab2)
##    
##              0          1
##   0 0.49005236 0.10157068
##   1 0.32356021 0.08481675
prop.table(tab2, 1)
##    
##             0         1
##   0 0.8283186 0.1716814
##   1 0.7923077 0.2076923
prop.table(tab2, 2)
##    
##             0         1
##   0 0.6023166 0.5449438
##   1 0.3976834 0.4550562

第一節:描述性統計(2)

mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
## [1] 34.41452
mean(dat[dat[,"Disease"] == 0,]$eGFR, na.rm = TRUE)
## [1] 33.19008
paste(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), sep = "")
## [1] "34.4145222034604±7.1582590897065"
paste0(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE))
## [1] "34.4145222034604±7.1582590897065"
m = mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
s = sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
formatC(m, digits = 3, format = "f")
## [1] "34.415"
formatC(s, digits = 3, format = "f")
## [1] "7.158"
paste0(formatC(m, digits = 3, format = "f"), "±", formatC(s, digits = 3, format = "f"))
## [1] "34.415±7.158"

練習1:學會利用自訂函數組合出想要的數據格式

  1. 函數之input必須為一類別變項及一連續變項

  2. 函數中必須設定一個變項,讓使用者能決定顯示小數點的位數

3-1. 第一個函數必須輸出在該類別變項為不同類別時,該連續變項之『平均數±標準差』

3-2. 第二個函數必須輸出在該類別變項為不同類別時,該連續變項之『中位數(25百分位-75百分位)』

– 注意:請考慮類別數不只是2的情形,以及類別項目並非為0、1、2、…的狀況

練習1答案

func.1 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  result = rep(NA, length(lvl.x))
  
  for (i in 1:length(lvl.x)) {
    m = mean(y[x==lvl.x[i]], na.rm = TRUE)
    s = sd(y[x==lvl.x[i]], na.rm = TRUE)
    result[i] = paste0(formatC(m, digits = dig, format = "f"), "±", formatC(s, digits = dig, format = "f"))
  }
  
  result
  
}

func.1(x = dat[,"Disease"], y = dat[,"eGFR"])
## [1] "33.190±6.404" "34.415±7.158"
func.1(x = dat[,"Education"], y = dat[,"SBP"], dig = 1)
## [1] "121.4±10.6" "121.4±10.9" "122.9±10.5"
func.2 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  result = rep(NA, length(lvl.x))
  
  for (i in 1:length(lvl.x)) {
    m = median(y[x==lvl.x[i]], na.rm = TRUE)
    q.25 = quantile(y[x==lvl.x[i]], 0.25, na.rm = TRUE)
    q.75 = quantile(y[x==lvl.x[i]], 0.75, na.rm = TRUE)
    result[i] = paste0(formatC(m, digits = dig, format = "f"), "(", formatC(q.25, digits = dig, format = "f"), "-", formatC(q.75, digits = dig, format = "f"), ")")
  }
  
  result
  
}

func.2(x = dat[,"Disease"], y = dat[,"eGFR"])
## [1] "33.193(28.971-37.238)" "34.663(29.729-38.820)"
func.2(x = dat[,"Education"], y = dat[,"SBP"], dig = 1)
## [1] "121.4(114.8-127.8)" "121.8(114.5-128.9)" "122.8(115.8-129.2)"

第二節:線性迴歸(1)

\[\hat{y_i} = b_{0} + b_{1}x_i\\\]

\[y_i = b_{0} + b_{1}x_i + \epsilon\\\]

\[\epsilon = y_i - b_{0} - b_{1}x_i\\\]

– 接著再定義找出一組\(b_0\)\(b_1\)使\(\sum{\epsilon}^2\)最小化:

\[\mbox{min}(\sum{\epsilon}^2) = \mbox{min}(\sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2)\]

第二節:線性迴歸(2)

\[ \begin{align} \frac{\partial}{\partial b_0} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 & = 0 \\ \frac{\partial}{\partial b_1} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 & = 0 \end{align} \]

– 對\(b_0\)偏微分的求解過程如下:

\[ \begin{align} \frac{\partial}{\partial b_0} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 &= 2\sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i) \frac{\partial}{\partial b_0} (y_i - b_{0} - b_{1}x_i)\\ &= 2\sum\limits_{i=1}^{n}-(y_i - b_{0} - b_{1}x_i) \end{align} \]

– 對\(b_1\)偏微分的求解過程如下:

\[ \begin{align} \frac{\partial}{\partial b_1} \sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i)^2 &= 2\sum\limits_{i=1}^{n}(y_i - b_{0} - b_{1}x_i) \frac{\partial}{\partial b_1} (y_i - b_{0} - b_{1}x_i)\\ &= 2\sum\limits_{i=1}^{n}-x_i(y_i - b_{0} - b_{1}x_i) \end{align} \]

\[ \begin{align} \sum\limits_{i=1}^{n}( b_{0} + b_{1}x_i - y_i) &= 0\\ \sum\limits_{i=1}^{n}( b_{0}x_i + b_{1}x_i^2 - y_ix_i) &= 0 \end{align} \]

第二節:線性迴歸(3)

\[ \begin{align} b_{1} &= \frac{\sum\limits_{i=1}^{n}y_ix_i-\frac{\sum\limits_{i=1}^{n}y_i\sum\limits_{i=1}^{n}x_i}{n}}{\sum\limits_{i=1}^{n}x_ix_i-\frac{\sum\limits_{i=1}^{n}x_i\sum\limits_{i=1}^{n}x_i}{n}} \\ b_{0} &= \frac{\sum\limits_{i=1}^{n}y_i}{n} - b_{1} \frac{\sum\limits_{i=1}^{n}x_i}{n} \end{align} \]

\[ \begin{align} b_{1} &= \frac{cov(x,y)}{var(x)} \\ b_{0} &= mean(y) - b_{1} \cdot mean(x) \end{align} \]

第二節:線性迴歸(4)

x = c(1, 2, 3, 4, 5)
y = c(6, 7, 9, 8, 10)

b1 = cov(x, y)/var(x)
b1
## [1] 0.9
b0 = mean(y) - b1 * mean(x)
b0
## [1] 5.3
model = lm(y~x)
model
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##         5.3          0.9
model$coefficients
## (Intercept)           x 
##         5.3         0.9

第二節:線性迴歸(5)

lm(SBP~DBP, data = dat)
## 
## Call:
## lm(formula = SBP ~ DBP, data = dat)
## 
## Coefficients:
## (Intercept)          DBP  
##      9.9012       0.9189
lm(dat[,"eGFR"]~dat[,"SBP"])
## 
## Call:
## lm(formula = dat[, "eGFR"] ~ dat[, "SBP"])
## 
## Coefficients:
##  (Intercept)  dat[, "SBP"]  
##     -44.2474        0.6443
lm(dat[,"eGFR"] ~ dat[,"SBP"] + dat[,"DBP"])
## 
## Call:
## lm(formula = dat[, "eGFR"] ~ dat[, "SBP"] + dat[, "DBP"])
## 
## Coefficients:
##  (Intercept)  dat[, "SBP"]  dat[, "DBP"]  
##     -49.2204        0.4562        0.2293
lm(dat[,"eGFR"] ~ ., data = dat[,c("SBP", "DBP"),drop=FALSE])
## 
## Call:
## lm(formula = dat[, "eGFR"] ~ ., data = dat[, c("SBP", "DBP"), 
##     drop = FALSE])
## 
## Coefficients:
## (Intercept)          SBP          DBP  
##    -49.2204       0.4562       0.2293

練習2:學會使用線性迴歸得到的結果產生預測公式

\[\hat{eGFR_i} = b_{0} + b_{1}SBP_i + b_{2}DBP_i\\\]

\[\hat{eGFR_i} = b_{0} + b_{1}DBP_i\\\]

練習2答案

– 這裡只展示第二種解法的函數:

pred_func = function (SBP = NA, DBP = NA) {
  
  if (is.na(SBP) & is.na(DBP)) {stop('需要至少輸入一個數值')} else {
    
    var_name = c('SBP', 'DBP')
    var_vec = c(1, SBP, DBP)
    
    var_name = var_name[!is.na(var_vec)[-1]]
    var_vec = var_vec[!is.na(var_vec)]
    
    model = lm(dat[,"eGFR"] ~ ., data = dat[,var_name,drop=FALSE])
    
    result = model$coefficients %*% var_vec
    result
    
  }
  
}

pred_func(SBP = 100)
##         [,1]
## [1,] 20.1858
pred_func(DBP = 100)
##          [,1]
## [1,] 20.21421
pred_func(SBP = 100, DBP = 100)
##          [,1]
## [1,] 19.32629

第三節:在線性迴歸中使用類別變項(1)

– 答案是可以的,對於已經編碼成0、1的變項我們可以直接把他放入線性迴歸,但是0、1、2的變項我們就不能直接放入線性迴歸之中,請想想為什麼?

x = c(0, 1, 2, 1, NA, 2)
coding_x = matrix(c(0, 1, 0, 1, NA, 0,
                    0, 0, 1, 0, NA, 1), nrow = 6, ncol = 2)
coding_x
##      [,1] [,2]
## [1,]    0    0
## [2,]    1    0
## [3,]    0    1
## [4,]    1    0
## [5,]   NA   NA
## [6,]    0    1

第三節:在線性迴歸中使用類別變項(2)

x = c(0, 1, 2, 1, NA, 2)

lvl.cat = levels(factor(x))
n.cat = length(lvl.cat)
coding_x = matrix(0, nrow = length(x), ncol = n.cat - 1)

for (i in 1:(n.cat-1)) {
  coding_x[x == lvl.cat[i+1],i] = 1
}

coding_x[is.na(x),] = NA

coding_x
##      [,1] [,2]
## [1,]    0    0
## [2,]    1    0
## [3,]    0    1
## [4,]    1    0
## [5,]   NA   NA
## [6,]    0    1
x = c(0, 1, 2, 1, NA, 2)
coding_x = model.matrix(~as.factor(x))
coding_x
##   (Intercept) as.factor(x)1 as.factor(x)2
## 1           1             0             0
## 2           1             1             0
## 3           1             0             1
## 4           1             1             0
## 6           1             0             1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`as.factor(x)`
## [1] "contr.treatment"
new_index = 1:length(x)
new_index[is.na(x)] = NA
new_index[!is.na(new_index)] = 1:sum(!is.na(x))
coding_x = coding_x[new_index,-1]
rownames(coding_x) = 1:nrow(coding_x)
one_hot_coding = function (x) {
  
  coding_x = model.matrix(~as.factor(x))
  new_index = 1:length(x)
  new_index[is.na(x)] = NA
  new_index[!is.na(new_index)] = 1:sum(!is.na(x))
  coding_x = coding_x[new_index,-1]
  rownames(coding_x) = 1:nrow(coding_x)
  coding_x
  
}

x = c(0, 1, 2, 1, NA, 2)
one_hot_coding(x)
##   as.factor(x)1 as.factor(x)2
## 1             0             0
## 2             1             0
## 3             0             1
## 4             1             0
## 5            NA            NA
## 6             0             1

第三節:在線性迴歸中使用類別變項(3)

coding_Income = one_hot_coding(dat[,"Income"])
coding_Education = one_hot_coding(dat[,"Education"])

dat1 = cbind(dat, coding_Income, coding_Education)

lm(dat[,"eGFR"] ~ ., data = dat1[,c(7:8, 11:14)])
## 
## Call:
## lm(formula = dat[, "eGFR"] ~ ., data = dat1[, c(7:8, 11:14)])
## 
## Coefficients:
##       (Intercept)                SBP                DBP  
##          -49.5690             0.4558             0.2305  
##   `as.factor(x)1`    `as.factor(x)2`  `as.factor(x)1.1`  
##            0.3370             1.2791             0.1097  
## `as.factor(x)2.1`  
##            0.2518
lm(dat[,"eGFR"] ~ dat[,"SBP"] + dat[,"DBP"] + factor(dat[,"Income"]) + factor(dat[,"Education"]))
## 
## Call:
## lm(formula = dat[, "eGFR"] ~ dat[, "SBP"] + dat[, "DBP"] + factor(dat[, 
##     "Income"]) + factor(dat[, "Education"]))
## 
## Coefficients:
##                 (Intercept)                 dat[, "SBP"]  
##                    -49.5690                       0.4558  
##                dat[, "DBP"]     factor(dat[, "Income"])1  
##                      0.2305                       0.3370  
##    factor(dat[, "Income"])2  factor(dat[, "Education"])1  
##                      1.2791                       0.1097  
## factor(dat[, "Education"])2  
##                      0.2518

練習3:擴充剛剛的預測函數

– 同樣的,假設使用者不願意填寫Income或是Education的情形,那就請你用剩餘的資訊進行預測。

– 這裡你就可以發現,假設我們有n個變項可供使用者填答,那他填與不填的所有組合就共有\(2^n-1\)種,因此剛剛的第二種寫法又更為重要了,試著完成他吧!

練習3答案

dat = read.csv("Example data.csv", header = TRUE, fileEncoding = 'CP950')

coding_Income = one_hot_coding(dat[,"Income"])
coding_Education = one_hot_coding(dat[,"Education"])

dat1 = cbind(dat, coding_Income, coding_Education)
colnames(dat1)[11:14] = c('Income1', 'Income2', 'Education1', 'Education2')

pred_func = function (SBP = NA, DBP = NA, Income = NA, Education = NA) {
  
  if (is.na(SBP) & is.na(DBP) & is.na(Income) & is.na(Education)) {stop('需要至少輸入一個數值')} else {
    
    if (is.na(Income)) {
      coding_Income = c(NA, NA)
    } else if (Income == 0) {
      coding_Income = c(0, 0)
    } else if (Income == 1) {
      coding_Income = c(1, 0)
    } else if (Income == 2) {
      coding_Income = c(0, 1)
    }
    
    if (is.na(Education)) {
      coding_Education = c(NA, NA)
    } else if (Education == 0) {
      coding_Education = c(0, 0)
    } else if (Education == 1) {
      coding_Education = c(1, 0)
    } else if (Education == 2) {
      coding_Education = c(0, 1)
    }
    
    var_name = c('SBP', 'DBP', 'Income1', 'Income2', 'Education1', 'Education2')
    var_vec = c(1, SBP, DBP, coding_Income, coding_Education)
    
    var_name = var_name[!is.na(var_vec)[-1]]
    var_vec = var_vec[!is.na(var_vec)]
    
    model = lm(dat[,"eGFR"] ~ ., data = dat1[,var_name,drop=FALSE])
    
    result = model$coefficients %*% var_vec
    result
    
  }
  
}

pred_func(SBP = 100, DBP = 100, Income = 1)
##          [,1]
## [1,] 19.44494
pred_func(SBP = 100, DBP = 100, Education = 2)
##          [,1]
## [1,] 19.53829
pred_func(SBP = 100, DBP = 100, Income = 2, Education = 0)
##          [,1]
## [1,] 20.33689

小結